#pacmanをインストール
if (!require(pacman)) {
install.packages("pacman")
}
#tidyvreseをロード
pacman::p_load(readxl, tidyverse, here, kableExtra, gt, modelsummary)RA Bootcamp Warmup
Analysis
パッケージの読み込み
1 記述統計
Masterの読み込み
#csvの読み込み
master <- read_csv(here("master.csv"))# A tibble: 6 × 23
unitid instnm semester quarter year yearofsem after totcohortsize
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 100654 ALABAMA A&M UNIVE… 1 0 1991 NA NA 1010
2 100654 ALABAMA A&M UNIVE… 1 0 1992 NA NA 876
3 100654 ALABAMA A & M UNI… 1 0 1993 NA NA 1019
4 100654 ALABAMA A & M UNI… 1 0 1995 NA NA 849
5 100654 ALABAMA A & M UNI… 1 0 1996 NA NA 716
6 100654 ALABAMA A & M UNI… 1 0 1997 NA NA 789
# ℹ 15 more variables: w_cohortsize <dbl>, m_cohortsize <dbl>,
# tot4yrgrads <dbl>, m_4yrgrads <dbl>, w_4yrgrads <dbl>,
# women_gradrate_4yr <dbl>, womengradrate4yr <dbl>, gradrate4yr <dbl>,
# mengradrate4yr <dbl>, instatetuition <dbl>, costs <dbl>, faculty <dbl>,
# white_cohortsize <dbl>, per_white_cohort <dbl>, per_women_cohort <dbl>
1.1 各列に含まれるNAの数を数える
#列ごとのNAをカウント
numbers_NA <- as.data.frame(sapply(master, function(x) sum(is.na(x)))) |>
rename("number_NA" = "sapply(master, function(x) sum(is.na(x)))")
#行と列を入れ替え
numbers_NA <- as.data.frame(t(numbers_NA))# A tibble: 1 × 23
unitid instnm semester quarter year yearofsem after totcohortsize
<int> <int> <int> <int> <int> <int> <int> <int>
1 0 0 0 0 0 12844 12844 0
# ℹ 15 more variables: w_cohortsize <int>, m_cohortsize <int>,
# tot4yrgrads <int>, m_4yrgrads <int>, w_4yrgrads <int>,
# women_gradrate_4yr <int>, womengradrate4yr <int>, gradrate4yr <int>,
# mengradrate4yr <int>, instatetuition <int>, costs <int>, faculty <int>,
# white_cohortsize <int>, per_white_cohort <int>, per_women_cohort <int>
1.2 要約統計の作成
#要約統計を作成
summary_master <- master |>
group_by(switcher = if_else(!is.na(after), "Switchers", "Never switchers")) |>
summarise(
`Semester calendar` = paste0(round(mean(semester, na.rm = TRUE), 2), " (", round(sd(semester, na.rm = TRUE), 2), ")"),
`Four-year graduation rate` = paste0(round(mean(gradrate4yr, na.rm = TRUE), 2), " (", round(sd(gradrate4yr, na.rm = TRUE), 2), ")"),
`Four-year women graduation rate` = paste0(round(mean(womengradrate4yr, na.rm = TRUE), 2), " (", round(sd(womengradrate4yr, na.rm = TRUE), 2), ")"),
`Four-year men graduation rate` = paste0(round(mean(mengradrate4yr, na.rm = TRUE), 2), " (", round(sd(mengradrate4yr, na.rm = TRUE), 2), ")"),
`Cohort size` = paste0(round(mean(totcohortsize, na.rm = TRUE), 2), " (", round(sd(totcohortsize, na.rm = TRUE), 2), ")"),
`Full-time-equivalent faculty` = paste0(round(mean(faculty, na.rm = TRUE), 2), " (", round(sd(faculty, na.rm = TRUE), 2), ")"),
`In-state tuition` = paste0(round(mean(instatetuition, na.rm = TRUE), 2), " (", round(sd(instatetuition, na.rm = TRUE), 2), ")")
) |>
bind_rows(master |>
summarise(
`Semester calendar` = paste0(round(mean(semester, na.rm = TRUE), 2), " (", round(sd(semester, na.rm = TRUE), 2), ")"),
`Four-year graduation rate` = paste0(round(mean(gradrate4yr, na.rm = TRUE), 2), " (", round(sd(gradrate4yr, na.rm = TRUE), 2), ")"),
`Four-year women graduation rate` = paste0(round(mean(womengradrate4yr, na.rm = TRUE), 2), " (", round(sd(womengradrate4yr, na.rm = TRUE), 2), ")"),
`Four-year men graduation rate` = paste0(round(mean(mengradrate4yr, na.rm = TRUE), 2), " (", round(sd(mengradrate4yr, na.rm = TRUE), 2), ")"),
`Cohort size` = paste0(round(mean(totcohortsize, na.rm = TRUE), 2), " (", round(sd(totcohortsize, na.rm = TRUE), 2), ")"),
`Full-time-equivalent faculty` = paste0(round(mean(faculty, na.rm = TRUE), 2), " (", round(sd(faculty, na.rm = TRUE), 2), ")"),
`In-state tuition` = paste0(round(mean(instatetuition, na.rm = TRUE), 2), " (", round(sd(instatetuition, na.rm = TRUE), 2), ")")
) |> mutate(switcher = "All")
) |>
arrange(match(switcher, c("All", "Never switchers", "Switchers")))
#行と列を入れ替え
summary_master <- as.data.frame(t(summary_master))
#列名を変更
colnames(summary_master) <- summary_master[1, ]
summary_master <- summary_master[-1, ]
#表を作成
summary_master |>
kbl(caption = "Table 1—Institution-Level Summary Statistics", format = "html") |>
kable_styling(bootstrap_options = "condensed",
full_width = FALSE,
font_size = 14,
position = "center",
html_font = "Times New Roman") |>
column_spec(1, width = "200px") |>
column_spec(2:4, width = "110px", extra_css = "text-align: center;") |>
row_spec(0, bold = TRUE, extra_css = "text-align: center; vertical-align: middle;",
hline_after = TRUE) |>
row_spec(1:nrow(summary_master), extra_css = "height: 40px; vertical-align: middle;") |>
footnote(general = "The balanced panel dataset includes the 1991–2010 entering cohorts. There are 731 institutions and 19 years. An observation is an institution year. Standard deviations are reported in parentheses.",
general_title = "Notes:",
footnote_as_chunk = TRUE)| All | Never switchers | Switchers | |
|---|---|---|---|
| Semester calendar | 0.93 (0.25) | 0.95 (0.22) | 0.7 (0.46) |
| Four-year graduation rate | 0.37 (0.23) | 0.38 (0.23) | 0.27 (0.18) |
| Four-year women graduation rate | 0.41 (0.23) | 0.42 (0.23) | 0.32 (0.2) |
| Four-year men graduation rate | 0.32 (0.23) | 0.33 (0.23) | 0.22 (0.18) |
| Cohort size | 1099.45 (1183.03) | 1084.86 (1170.03) | 1278.78 (1319.97) |
| Full-time-equivalent faculty | 340 (382.59) | 335.03 (377.78) | 401.04 (432.91) |
| In-state tuition | 11088.47 (9181.55) | 11375.81 (9238.61) | 7556.8 (7612.64) |
| Notes: The balanced panel dataset includes the 1991–2010 entering cohorts. There are 731 institutions and 19 years. An observation is an institution year. Standard deviations are reported in parentheses. |
1.3 4年卒業率の平均推移をプロット
#4年卒業率を計算
summary_semesterrate <- master |>
group_by(year) |>
summarize(fraction_on_semesters = mean(semester, na.rm = TRUE))
# グラフをプロット
summary_semesterrate |>
ggplot(aes(x = year)) +
geom_line(aes(y = fraction_on_semesters), color = "black", size = 0.5) +
scale_y_continuous(
name = "4-year graduation rate",
limits = c(0.8, 1)
) +
labs(
title = "Figure 1. Four-Year Graduation Rates",
x = "Year"
) +
theme_minimal()+
theme(
plot.title = element_text(hjust = 0.5, family = "serif"),
panel.grid = element_blank(),
axis.line = element_line(),
axis.ticks = element_line(),
axis.title.y.left = element_text(margin = margin(r = 10)),
axis.title.y.right = element_text(margin = margin(l = 10))
)1.4 semester制導入率の平均推移をプロット
#4年卒業率を計算
summary_gradrate4yr <- master |>
group_by(year) |>
summarize(avg_gradrate4yr = mean(gradrate4yr, na.rm = TRUE))
# グラフをプロット
summary_gradrate4yr |>
ggplot(aes(x = year)) +
geom_line(aes(y = avg_gradrate4yr), color = "black", size = 0.5) +
scale_y_continuous(
name = "Fraction of schools on semesters",
limits = c(0.25, 0.45)
) +
labs(
title = "Figure 2. Fraction of Schools on Semesters",
x = "Year"
) +
theme_minimal()+
theme(
plot.title = element_text(hjust = 0.5, family = "serif"),
panel.grid = element_blank(),
axis.line = element_line(),
axis.ticks = element_line(),
axis.title.y.left = element_text(margin = margin(r = 10)),
axis.title.y.right = element_text(margin = margin(l = 10))
)1.5 4年卒業率とsemester制導入率の平均推移を同時にプロット
#要約統計量を計算
summary_rates <- master |>
group_by(year) |>
summarize(fraction_on_semesters = mean(semester, na.rm = TRUE),
avg_gradrate4yr = mean(gradrate4yr, na.rm = TRUE))
# グラフをプロット
summary_rates |>
ggplot(aes(x = year)) +
geom_line(aes(y = fraction_on_semesters, linetype = "Fraction of schools on semesters"), color = "black", size = 0.5) +
geom_line(aes(y = avg_gradrate4yr + 0.55, linetype = "4-year graduation rate"), color = "black", size = 0.5) +
scale_y_continuous(
name = "Fraction of schools on semesters",
limits = c(0.8, 1), # Adjust limits as necessary
sec.axis = sec_axis(~. -0.55, name = "4-year graduation rate" )
) +
scale_linetype_manual(
values = c("Fraction of schools on semesters" = "solid",
"4-year graduation rate" = "dashed")
) +
labs(
title = "Figure 3. Fraction of Schools on Semesters and Four-Year Graduation Rates",
x = "Year",
linetype = "Legend"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, family = "serif"),
legend.position = "bottom",
panel.grid = element_blank(),
axis.line = element_line(),
axis.ticks = element_line(),
axis.title.y.left = element_text(margin = margin(r = 10)),
axis.title.y.right = element_text(margin = margin(l = 10))
)1.6 散布図を作成
# 散布図を作成する関数
create_scatter_plot <- function(data, x_col, y_col) {
x_col <- enquo(x_col)
y_col <- enquo(y_col)
ggplot(data, aes(x = !!x_col, y = !!y_col)) +
geom_point(color = "blue", alpha = 0.2) +
labs(x = quo_name(x_col), y = quo_name(y_col)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, family = "serif"))
}
plot1 <- create_scatter_plot(master, gradrate4yr, per_women_cohort) + labs(title = "Figure 4a. Four-Year Graduation Rates vs Female Students Ratio")
plot2 <- create_scatter_plot(master, gradrate4yr, per_white_cohort) + labs(title = "Figure 4b. Four-Year Graduation Rates vs White Students Ratio")
plot3 <- create_scatter_plot(master, gradrate4yr, instatetuition) + labs(title = "Figure 4c. Four-Year Graduation Rates vs In-state Tuition")2 回帰分析
2.1 次の回帰式を推定
lm(formula = gradrate4yr ~ after, data = master) |>
modelsummary()| (1) | |
|---|---|
| (Intercept) | 0.251 |
| (0.010) | |
| after | 0.031 |
| (0.012) | |
| Num.Obs. | 1045 |
| R2 | 0.006 |
| R2 Adj. | 0.005 |
| AIC | -577.3 |
| BIC | -562.4 |
| Log.Lik. | 291.633 |
| RMSE | 0.18 |